Loading the Libraries

# package loading
library(RnaSeqSampleSize)
## Loading required package: ggplot2
## Loading required package: RnaSeqSampleSizeData
## Loading required package: edgeR
## Loading required package: limma
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
library(biomaRt)
set.seed(538734517)

Executive Summary:

Laying out the Study

Parsing DataMat

# file parsing and cleaning
d <- read.delim("/Users/paulcao/Downloads/GSE184156_RNAseq_ECM_rnaseq_featurecounts.txt",
    header = TRUE, sep = "\t", skip = 1)
dataMat <- d[, c(10, 11, 12, 16, 17, 18)]
rownames(dataMat) <- d$Geneid
colnames(dataMat) <- c("LIVER1", "LIVER2", "LIVER3", "NO13", "NO14", "NO15")

# drop rows containing n/a dataMat <- na.omit(dataMat)
head(dataMat)
##             LIVER1 LIVER2 LIVER3 NO13 NO14 NO15
## DDX11L1          0      0      0    0    0    0
## WASH7P          63     69     37   69   53   48
## MIR6859-1       20     16      8   28   21   20
## MIR1302-2HG      0      0      0    0    0    0
## MIR1302-2        0      0      0    0    0    0
## FAM138A          0      0      0    0    0    0
dim(dataMat)
## [1] 39011     6

Sourcing the Cadherin and Integrin Genes

Extracting all genes from GeneCards where the gene-name contains either ‘cadherin’ or ‘integrin’ (93 genes)

Parsing Gene IDs

genelist <- read.delim("/Users/paulcao/cadherin_and_integrin.txt", header=TRUE, sep="\t")
genelist <- genelist[, c(2, 3)]
colnames(genelist) <- c("Gene.name", "Gene.stable.ID")
head(genelist)
##   Gene.name Gene.stable.ID
## 1      CDH1    GC16P068737
## 2      CDH2    GC18M028088
## 3      CDH5    GC16P066366
## 4      CDH3    GC16P068637
## 5     CDH13    GC16P082626
## 6     CDH11    GC16M064943
dim(genelist)
## [1] 78  2

Power analysis

RNAseqsamplesize [@Zhao2018] was used to do the analysis. With expected fold change between groups = 2, FDR set = 0.01 and number of samples = 5, power was computed to be 0.0727 for the selected genes of interest which is very low. Dispersion was also computed as 0.00385.

# estimate gene read count and dispersion distribution
distribution <- est_count_dispersion(dataMat, group = c(0, 0, 0, 1, 1, 1))
## Disp = 0.00385 , BCV = 0.062
# power estimation some gene IDs are not in dataset because they are alleles on
# alternative sequences (rapsn and Fbxo32)
power <- est_power_distribution(n = 5, f = 0.01, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power$power)  # 0.07266288
## [1] 0.07266288
power2 <- est_power_distribution(n = 6, f = 0.01, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power2$power)
## [1] 0.1261367

est_power_distribution

From changing around FDR (fdr parameter in est_power_curve()) and coverage (lambda0 parameter in est_power_curve()) as well as making an optimization plot, it appears that at least 10 samples are needed with an average coverage > 15 reads/gene depending on desired FDR in order to achieve >80% power. To be precise, 18 samples with 10 coverage and FDR=0.05 will give 83% power. To be safe, for the genes of interest we would need at least 21 samples to achieve 80% power.

Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.

Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.

est_power_curve

Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.

Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.

Iterating and calculating powers through different sample-size (n):

est_power(n = 8, lambda0 = 20, phi0 = 0.07154, f = 0.05, m = 52000, m1 = 71)
## [1] 0.51
power10 <- est_power_distribution(n = 10, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power10$power)
## [1] 0.5767527
power13 <- est_power_distribution(n = 13, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power13$power)
## [1] 0.7247843
power15 <- est_power_distribution(n = 15, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power15$power)
## [1] 0.7799651
power18 <- est_power_distribution(n = 18, f = 0.05, rho = 2, minAveCount = 15, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power18$power)
## [1] 0.8300565

Extract the mean gene read count across all samples:

gene_readcounts <- distribution$pseudo.counts.mean[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
gene_dispersions <- distribution$tagwise.dispersion[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]

head(gene_readcounts)
##      ITGB3BP       CELSR2       ITGA10     ITGB1BP1        ITGB6        ITGA6 
##   448.611665 18530.060807    31.379411  2193.086763     1.070137   136.017278
mean(gene_readcounts)
## [1] 1384.193
mean(gene_dispersions)
## [1] 0.02174131
ggplot(data=data.frame(counts=as.numeric(gene_readcounts),name=as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)]))) + geom_bar(aes(x=reorder(name,-counts),y=counts),stat="identity") + scale_y_log10() + coord_flip() + xlab("Gene name") + ylab("Mean count per gene across samples") + theme(axis.text.y = element_text(size=7)) + ggtitle("Mean count per gene across samples from experimental data") + labs(caption="Each sample has approximately 50 million reads. Experiment ID GSE58669")

gene_readcounts
##      ITGB3BP       CELSR2       ITGA10     ITGB1BP1        ITGB6        ITGA6 
## 4.486117e+02 1.853006e+04 3.137941e+01 2.193087e+03 1.070137e+00 1.360173e+02 
##        ITGAV        ITGA9       CELSR3 LOC100130345        ITGB5         IBSP 
## 2.250075e+03 4.167625e+00 1.274939e+03 3.811665e+00 3.468786e+03 1.028280e+01 
##        CDH18        CDH10        ITGA1        ITGA2        CDHR2        ITGB8 
## 2.441045e+01 1.718927e+00 3.695383e+01 4.949955e+02 2.130482e+00 1.155578e+01 
##        CDHR3        CPED1        ITGB1          ILK        ITFG2        ITGB7 
## 5.084312e+01 3.680381e+00 4.714837e+03 1.733754e+03 6.097740e+02 3.540124e+01 
##        ITGA5        ITGA7        CDH24        ITGAL        ITGAM        ITGAX 
## 2.553918e+02 4.614826e+01 1.193567e+03 2.505413e+00 3.133014e+01 4.150712e+00 
##        ITFG1         CDH5         CDH3         CDH1        CDH15        ITGAE 
## 8.586064e+02 3.100385e+00 2.984464e+03 1.360194e+04 2.725789e+01 6.049293e+02 
##       ITGA2B        ITGB3        ITGA3        ITGB4        CDH22        CDH26 
## 3.647727e+02 1.068397e+00 1.682625e+03 1.274514e+03 9.652477e-01 5.922976e+01 
##        ITGB2       CELSR1     ITGB1BP2 
## 2.753119e+01 3.178135e+03 1.410967e+01

Sample_Size_Distribution (genelist of all Cadherin and Integrin Genes)

ss<-sample_size_distribution(f=0.05,distributionObject=distribution,selectedGenes=genelist$Gene.name,storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
ss
## $iter
## [1] 7
## 
## $f.root
## [1] 0.0155463
## 
## $root
## [1] 16
## 
## $process
##       N       Power
## [1,]  1 0.005460439
## [2,]  9 0.587380911
## [3,] 13 0.756353866
## [4,] 15 0.799667357
## [5,] 16 0.815546304
## [6,] 17 0.829616720
## [7,] 33 0.925866132
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 5e-02 1e+00 2e+00 1e+00
names <- as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)])
to_drop <- names[which(gene_readcounts < 10)]
ss_drop10 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=10)),storeProcess = TRUE)
# 76.7% power with N=9, 83.1% power with N=10
ss_drop10
## $iter
## [1] 7
## 
## $f.root
## [1] 0.003185019
## 
## $root
## [1] 9
## 
## $process
##       N     Power
## [1,]  1 0.0166177
## [2,]  5 0.4009260
## [3,]  7 0.6373315
## [4,]  8 0.7337988
## [5,]  9 0.8031850
## [6,] 17 0.9867463
## [7,] 33 0.9960203
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

Sample_Size_Distribution (genelist of all genes with read counts >= 80)

to_drop80 <- names[which(gene_readcounts < 80)]
ss_drop80 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
# basically when rho goes up and genes go down it works
# so what exactly is rho?
# basically effect size... so question: do we expect it to be bigger or smaller than in the 2 group situation?
ss_drop80
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04811613
## 
## $root
## [1] 9
## 
## $process
##       N      Power
## [1,]  1 0.01870999
## [2,]  5 0.44171403
## [3,]  7 0.68761390
## [4,]  8 0.78338012
## [5,]  9 0.84811613
## [6,] 17 0.99309292
## [7,] 33 0.99600706
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

Sample_Size_Distribution (genelist of all genes with read counts >= 200)

to_drop200 <- names[which(gene_readcounts < 200)]
ss_drop200 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=200)),storeProcess = TRUE)

Sample_Size_Distribution (genelist of all genes with read counts >= 80 and minimum fold change >=2.5)

ss_drop80_fold2.5 <- sample_size_distribution(f=0.1,rho=2.5,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
ss_drop80_fold2.5
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04257095
## 
## $root
## [1] 5
## 
## $process
##       N      Power
## [1,]  1 0.05866677
## [2,]  3 0.49404035
## [3,]  4 0.70411021
## [4,]  5 0.84257095
## [5,]  9 0.99070871
## [6,] 17 0.99600868
## [7,] 33 0.99600717
## 
## $parameters
##   power       m      m1     fdr       w     rho       k 
##     0.8 10000.0   100.0     0.1     1.0     2.5     1.0

Sample_Size_Distribution (genelist of all genes that start with ‘CAH’ and ‘ITGA’; a more restrictive list of Cadherin and Integrin without pseudogenes)

kept <- append(rownames(genelist[grepl("CDH[0-9]+$", genelist$Gene.name), ]), rownames(genelist[grepl("ITGA[0-9]+$", genelist$Gene.name), ]))
genelist[kept,]$Gene.name[which(!genelist[kept,]$Gene.stable.ID %in% names(distribution$pseudo.counts.mean))]
##  [1] "CDH1"   "CDH2"   "CDH5"   "CDH3"   "CDH13"  "CDH11"  "CDH17"  "CDH15" 
##  [9] "CDH4"   "CDH12"  "CDH6"   "CDH10"  "CDH16"  "CDH23"  "CDH18"  "CDH8"  
## [17] "CDH9"   "CDH7"   "CDH22"  "CDH20"  "CDH19"  "CDH26"  "CDH24"  "ITGA7" 
## [25] "ITGA6"  "ITGA4"  "ITGA5"  "ITGA2"  "ITGA3"  "ITGA1"  "ITGA9"  "ITGA11"
## [33] "ITGA8"  "ITGA10"
kept_final <- kept[which(genelist[kept,]$Gene.name %in% names(distribution$pseudo.counts.mean))]
power_kept <- est_power_distribution(n=6,f=0.1,m=52000,m1=4000,distributionObject=distribution,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess = TRUE)
mean(power_kept$power) #32% power with just kept genes and 6 samples each group
## [1] 0.4895173
ss_kept <- sample_size_distribution(f=0.1,rho=2,distributionObject=distribution,selectedGenes=genelist[kept,]$Gene.name,storeProcess=TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 17 selectedGenes were not found in distributionObject, discard them for further analysis
ss_kept
## $iter
## [1] 7
## 
## $f.root
## [1] 0.01177878
## 
## $root
## [1] 14
## 
## $process
##       N      Power
## [1,]  1 0.01292141
## [2,]  9 0.64958749
## [3,] 13 0.79281187
## [4,] 14 0.81177878
## [5,] 15 0.82756311
## [6,] 17 0.85210797
## [7,] 33 0.93997869
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

List all of the Found Genes in the RNASeq expression data(not just the genelist) [45 Genes]

found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
nrow(found)
## [1] 45
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
##       Gene.name Gene.stable.ID
## 1          CDH1    GC16P068737
## 3          CDH5    GC16P066366
## 4          CDH3    GC16P068637
## 8         CDH15    GC16P089171
## 12        CDH10    GC05M024522
## 15        CDH18    GC05M019508
## 19        CDH22    GC20M046173
## 22        CDH26    GC20P059958
## 23       CELSR1    GC22M046360
## 24        CDHR3    GC07P105876
## 25        CDH24    GC14M023047
## 26       CELSR2    GC01P109250
## 27       CELSR3    GC03M048641
## 29        CDHR2    GC05P176542
## 33        CPED1    GC07P120988
## 37 LOC100130345    GC03M063749
## 38        ITGA7    GC12M055684
## 39        ITGB1    GC10M033645
## 40        ITGB3    GC17P062247
## 41        ITGA6    GC02P172427
## 42        ITGAV    GC02P186589
## 43        ITGB2    GC21M044885
## 44        ITGB4    GC17P075721
## 46        ITGA5    GC12M055005
## 47        ITGA2    GC05P052989
## 48        ITGA3    GC17P050055
## 49        ITGAL    GC16P030472
## 50        ITGB6    GC02M160099
## 51       ITGA2B    GC17M051904
## 52        ITGB7    GC12M053191
## 53        ITGA1    GC05P052788
## 54        ITGAM    GC16P042850
## 55        ITGB5    GC03M124761
## 56          ILK    GC11P006604
## 57        ITGAX    GC16P042860
## 58        ITGA9    GC03P037468
## 59        ITGAE    GC17M006905
## 60        ITGB8    GC07P020329
## 61     ITGB1BP1    GC02M009391
## 65       ITGA10    GC01M145891
## 67     ITGB1BP2    GC0XP071333
## 68         IBSP    GC04P087799
## 69      ITGB3BP    GC01M063440
## 70        ITFG2    GC12P002812
## 71        ITFG1    GC16M047156
n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)

# CDH1:
musk_id <- which(found$Gene.name=='CDH1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
total_power <- sapply(gene_powers, function(x) mean(x))

total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))

# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))

# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))

# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","total_power","gene_powers_random"),labels=c("CDH1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))

Estimating the Power Curve using Default m and m1 parameters

length(names(which(distribution$pseudo.counts.mean>=10)))
## [1] 14065
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
##       Gene.name Gene.stable.ID
## 1          CDH1    GC16P068737
## 3          CDH5    GC16P066366
## 4          CDH3    GC16P068637
## 8         CDH15    GC16P089171
## 12        CDH10    GC05M024522
## 15        CDH18    GC05M019508
## 19        CDH22    GC20M046173
## 22        CDH26    GC20P059958
## 23       CELSR1    GC22M046360
## 24        CDHR3    GC07P105876
## 25        CDH24    GC14M023047
## 26       CELSR2    GC01P109250
## 27       CELSR3    GC03M048641
## 29        CDHR2    GC05P176542
## 33        CPED1    GC07P120988
## 37 LOC100130345    GC03M063749
## 38        ITGA7    GC12M055684
## 39        ITGB1    GC10M033645
## 40        ITGB3    GC17P062247
## 41        ITGA6    GC02P172427
## 42        ITGAV    GC02P186589
## 43        ITGB2    GC21M044885
## 44        ITGB4    GC17P075721
## 46        ITGA5    GC12M055005
## 47        ITGA2    GC05P052989
## 48        ITGA3    GC17P050055
## 49        ITGAL    GC16P030472
## 50        ITGB6    GC02M160099
## 51       ITGA2B    GC17M051904
## 52        ITGB7    GC12M053191
## 53        ITGA1    GC05P052788
## 54        ITGAM    GC16P042850
## 55        ITGB5    GC03M124761
## 56          ILK    GC11P006604
## 57        ITGAX    GC16P042860
## 58        ITGA9    GC03P037468
## 59        ITGAE    GC17M006905
## 60        ITGB8    GC07P020329
## 61     ITGB1BP1    GC02M009391
## 65       ITGA10    GC01M145891
## 67     ITGB1BP2    GC0XP071333
## 68         IBSP    GC04P087799
## 69      ITGB3BP    GC01M063440
## 70        ITFG2    GC12P002812
## 71        ITFG1    GC16M047156
n <- 1:20
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)

# CDH1:
musk_id <- which(found$Gene.name=='CDH1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
total_power <- sapply(gene_powers, function(x) mean(x))

total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))

# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))

# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78))

# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","total_power","gene_powers_random"),labels=c("CDH1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,20,2)) + scale_y_continuous(breaks=seq(0,1,0.2))

References